/// <summary>
/// Initializes the tokenizer: creates its URL detector, pre-sizes the word list,
/// caches the shared lookup tables and prepares the to-upper word buffer.
/// </summary>
/// <param name="urlModel">Model handed to the <c>UrlDetectorConfig</c> of the internal URL detector.</param>
/// <param name="wordCapacity">
/// Requested initial capacity of the word list; values below
/// <c>DEFAULT_WORDCAPACITY</c> are raised to that default.
/// </param>
public mld_tokenizer(UrlDetectorModel urlModel, int wordCapacity)
{
    // The URL detector is configured with the "Position" extract mode.
    _UrlDetector = new UrlDetector(
        new UrlDetectorConfig()
        {
            Model          = urlModel,
            UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position,
        });

    // Never start with fewer slots than the default capacity.
    var initialWordCapacity = Math.Max(DEFAULT_WORDCAPACITY, wordCapacity);
    _Words    = new List<string>(initialWordCapacity);
    _NgramsSB = new StringBuilder();

    // Delegate is created once here and stored in a field.
    _AddWordToListAction = new Action<string>(AddWordToList);

    // Cache the tables exposed by the shared xlat_Unsafe / UnsafeConst instances.
    _UIM = xlat_Unsafe.Inst._UPPER_INVARIANT_MAP;
    _CTM = xlat_Unsafe.Inst._CHARTYPE_MAP;
    _IAW = UnsafeConst.Inst._INTERPRETE_AS_WHITESPACE;
    _DWC = UnsafeConst.Inst._DIGIT_WORD_CHARS;

    // Kept as the last step, as in the original statement order.
    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);
}
/// <summary>
/// Creates a classifier configuration whose URL-detector model is constructed
/// from the given resources XML file name.
/// </summary>
/// <param name="urlDetectorResourcesXmlFilename">
/// File name passed unchanged to the <c>UrlDetectorModel</c> constructor.
/// </param>
public ClassifierConfig(string urlDetectorResourcesXmlFilename)
{
    var model = new UrlDetectorModel(urlDetectorResourcesXmlFilename);
    UrlDetectorModel = model;
}
/// <summary>
/// Console entry point. Prints the effective configuration, optionally raises the
/// process priority, loads the URL-detector model and runs a single model build
/// (portion-based or regular, depending on <c>USE_PORTION</c>), reporting the elapsed time.
/// Any exception is printed in red, after which the program waits for ENTER before exiting.
/// </summary>
/// <param name="args">Command-line arguments; not used by this method.</param>
private static void Main(string[] args)
{
    var failed = false;
    try
    {
        // Obtained inside the try so that a failure while reading the configuration is reported below.
        var cfg = Config.Inst;
        var separator = "----------------------------------------------";

        // --- print the configuration to the console ---
        Console.WriteLine($"{Environment.NewLine}{separator}");
        Console.WriteLine($"USE_HIGH_PRIORITY : '{cfg.USE_HIGH_PRIORITY}'");
        Console.WriteLine($"NGARMS : '{cfg.NGARMS}'");
        Console.WriteLine($"CUT_PERCENT : '{cfg.CUT_PERCENT.GetValueOrDefault()}%'");
        Console.WriteLine($"INPUT_DIRECTORY : '{cfg.INPUT_DIRECTORY}'");
        Console.WriteLine($"INPUT_ENCODING : '{cfg.INPUT_ENCODING.WebName}'");
        Console.WriteLine($"CLEAR_CYRILLICS_CHARS : '{cfg.CLEAR_CYRILLICS_CHARS}'");
        Console.WriteLine($"CLEAR_DIGITS_CHARS : '{cfg.CLEAR_DIGITS_CHARS}'");
        Console.WriteLine($"SINGLE_WORD_MAX_LENGTH : '{cfg.SINGLE_WORD_MAX_LENGTH}'");
        Console.WriteLine($"OUTPUT_DIRECTORY : '{cfg.OUTPUT_DIRECTORY}'");
        Console.WriteLine($"OUTPUT_ENCODING : '{cfg.OUTPUT_ENCODING.WebName}'");
        Console.WriteLine($"USE_PORTION : '{cfg.USE_PORTION}'");
        if (cfg.USE_PORTION)
        {
            // The portion size is only relevant (and only shown) when portions are enabled.
            Console.WriteLine($"MAX_PORTION_SIZE : '{cfg.MAX_PORTION_SIZE}'");
        }
        Console.WriteLine($"{separator}{Environment.NewLine}");

        // --- optional high process priority ---
        if (cfg.USE_HIGH_PRIORITY)
        {
            Extensions.SetCurrentProcessHighPriority();
        }

        // --- url-detector model ---
        var urlDetectorModel = new UrlDetectorModel(cfg.URL_DETECTOR_RESOURCES_XML_FILENAME);

        // --- build the model ---
        var buildParams = new BuildParams_t()
        {
            UrlDetectorModel    = urlDetectorModel,
            InputDirectory      = cfg.INPUT_DIRECTORY,
            Ngrams              = cfg.NGARMS,
            CutPercent          = cfg.CUT_PERCENT,
            OutputDirectory     = cfg.OUTPUT_DIRECTORY,
            MaxPortionSize      = cfg.MAX_PORTION_SIZE,
            ClearCyrillicsChars = cfg.CLEAR_CYRILLICS_CHARS,
            ClearDigitsChars    = cfg.CLEAR_DIGITS_CHARS,
            SingleWordMaxLength = cfg.SINGLE_WORD_MAX_LENGTH,
        };

        // Only the build itself is timed, not the parameter setup above.
        var timer = Stopwatch.StartNew();
        if (!cfg.USE_PORTION)
        {
            ModelBuilder.Build(buildParams);
        }
        else
        {
            ModelBuilder.Build_UsePortion(buildParams);
        }
        timer.Stop();

        Console.WriteLine($"'{cfg.NGARMS}; cut_{cfg.CUT_PERCENT.GetValueOrDefault()}%' - success, elapsed: {timer.Elapsed}{Environment.NewLine}");
    }
    catch (Exception ex)
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine($"{Environment.NewLine}{ex}{Environment.NewLine}");
        Console.ResetColor();
        failed = true;
    }

    // After a failure keep the console open until the user confirms, so the error stays readable.
    var farewell = failed
        ? "[.....finita fusking comedy (push ENTER 4 exit).....]"
        : "[.....finita fusking comedy.....]";
    Console.WriteLine($"{Environment.NewLine}{farewell}");
    if (failed)
    {
        Console.ReadLine();
    }
}
/// <summary>
/// Creates a tokenizer using <c>DEFAULT_WORDCAPACITY</c> as the word capacity;
/// all work is delegated to the two-argument constructor.
/// </summary>
/// <param name="urlModel">Forwarded unchanged to the two-argument constructor.</param>
public mld_tokenizer(UrlDetectorModel urlModel)
    : this(urlModel, DEFAULT_WORDCAPACITY)
{
}
/// <summary>
/// Console entry point. Prints the effective settings, tunes the GC latency mode,
/// optionally boosts process/thread priority, loads the URL-detector model and then
/// builds either a single model or a series of models, depending on <c>BUILD_MODE</c>.
/// </summary>
/// <remarks>
/// In the multi-model modes a failure of one build is reported in red and the loop
/// continues with the next parameter set. If anything failed, the program waits for
/// ENTER before exiting; on full success it exits immediately.
/// </remarks>
/// <param name="args">Command-line arguments; not used by this method.</param>
private static void Main(string[] args)
{
    var wasErrors = false;
    try
    {
        // --- print the configuration to the console ---
        Console.WriteLine(Environment.NewLine + "----------------------------------------------");
        Console.WriteLine("USE_BOOST_PRIORITY: '" + USE_BOOST_PRIORITY + "'");
        Console.WriteLine("BUILD_MODE : '" + BUILD_MODE + "'");
        switch (BUILD_MODE)
        {
            case BuildModeEnum.single_model:
                Console.WriteLine("METHOD : '" + METHOD + "'");
                Console.WriteLine("NGARMS : '" + NGARMS + "'");
                Console.WriteLine("D_PARAM : '" + D_PARAM + "'");
                break;

            case BuildModeEnum.all_models_by_method:
                Console.WriteLine("METHOD : '" + METHOD + "'");
                break;
        }
        Console.WriteLine("INPUT_FILES : '" + string.Join("'; '", INPUT_FILES) + "'");
        Console.WriteLine("INPUT_FOLDER : '" + INPUT_FOLDER + "'");
        Console.WriteLine("INPUT_ENCODING : '" + INPUT_ENCODING.WebName + "'");
        Console.WriteLine("OUTPUT_FILE : '" + OUTPUT_FILE_PATTERN + "'");
        Console.WriteLine("OUTPUT_ENCODING : '" + OUTPUT_ENCODING.WebName + "'");
        Console.WriteLine("----------------------------------------------" + Environment.NewLine);

        // --- GC ---
        // Ask for low-latency GC; if reading the setting back shows it was not applied, use Batch instead.
        GCSettings.LatencyMode = GCLatencyMode.LowLatency;
        if (GCSettings.LatencyMode != GCLatencyMode.LowLatency)
        {
            GCSettings.LatencyMode = GCLatencyMode.Batch;
        }

        // --- optional priority boost ---
        if (USE_BOOST_PRIORITY)
        {
            // Process is IDisposable; release the component once the priority settings are applied.
            using (var pr = Process.GetCurrentProcess())
            {
                pr.PriorityClass = ProcessPriorityClass.RealTime;
                pr.PriorityBoostEnabled = true;
            }
            Thread.CurrentThread.Priority = ThreadPriority.Highest;
        }

        // --- url-detector model ---
        var urlDetectorModel = new UrlDetectorModel(URL_DETECTOR_RESOURCES_XML_FILENAME);

        // --- build model(s) ---
        if (BUILD_MODE == BuildModeEnum.single_model)
        {
            var bp = new build_params_t()
            {
                UrlDetectorModel      = urlDetectorModel,
                InputFolder           = INPUT_FOLDER,
                InputFilenames        = INPUT_FILES,
                Method                = METHOD,
                Ngrams                = NGARMS,
                D_param               = D_PARAM,
                OutputFilenamePattern = OUTPUT_FILE_PATTERN,
            };

            var sw = Stopwatch.StartNew();
            Build(bp);
            sw.Stop();

            Console.WriteLine("'" + METHOD + "; " + NGARMS + "; " + D_PARAM + "' - success, elapsed: " + sw.Elapsed);
        }
        else
        {
            // Either every parameter combination for the configured method, or all combinations.
            var tuples = (BUILD_MODE == BuildModeEnum.all_models_by_method)
                ? GetProcessParams(METHOD)
                : GetProcessParams();

            var sw_total = Stopwatch.StartNew();
            foreach (var t in tuples)
            {
                var bp = new build_params_t()
                {
                    UrlDetectorModel      = urlDetectorModel,
                    InputFolder           = INPUT_FOLDER,
                    InputFilenames        = INPUT_FILES,
                    Method                = t.Item1,
                    Ngrams                = t.Item2,
                    D_param               = t.Item3,
                    OutputFilenamePattern = OUTPUT_FILE_PATTERN,
                };
                try
                {
                    var sw = Stopwatch.StartNew();
                    Build(bp);
                    sw.Stop();

                    Console.WriteLine("'" + bp.Method + "; " + bp.Ngrams + "; " + bp.D_param + "' - success, elapsed: " + sw.Elapsed);
                }
                catch (Exception ex)
                {
                    // A single failed build must not abort the remaining ones: report it and go on.
                    var fc = Console.ForegroundColor;
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine("'" + bp.Method + "; " + bp.Ngrams + "; " + bp.D_param + "' - " + ex.GetType() + ": " + ex.Message);
                    Console.ForegroundColor = fc;
                    wasErrors = true;
                }
            }
            sw_total.Stop();

            Console.WriteLine("total elapsed: " + sw_total.Elapsed);
        }
    }
    catch (Exception ex)
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine(Environment.NewLine + ex + Environment.NewLine);
        Console.ResetColor();
        wasErrors = true;
    }

    // Only prompt for ENTER when the program really waits for it; previously the
    // prompt was printed even on success, where the process exits immediately.
    if (wasErrors)
    {
        Console.WriteLine(Environment.NewLine + "[.....finita fusking comedy (push ENTER 4 exit).....]");
        Console.ReadLine();
    }
    else
    {
        Console.WriteLine(Environment.NewLine + "[.....finita fusking comedy.....]");
    }
}
/// <summary>
/// Creates a tokenizer using <c>DEFAULT_WORDCAPACITY</c> as the word capacity;
/// all work is delegated to the two-argument constructor.
/// </summary>
/// <param name="urlModel">Forwarded unchanged to the two-argument constructor.</param>
public ClassifyTokenizer(UrlDetectorModel urlModel)
    : this(urlModel, DEFAULT_WORDCAPACITY)
{
}