Exemplo n.º 1
0
        /// <summary>
        /// Creates a detector from the supplied configuration and model.
        /// </summary>
        /// <param name="config">
        /// Source of <c>Threshold</c>, <c>CyrillicLettersPercent</c> and the
        /// <c>UrlDetectorModel</c> handed to the tokenizer. Checked with <c>ThrowIfNull</c>.
        /// </param>
        /// <param name="model">Model kept by the detector. Checked with <c>ThrowIfNull</c>.</param>
        public RDetector(RDetectorConfig config, IRModel model)
        {
            // Validation order is: config, model, then the nested URL-detector model.
            // NOTE(review): ThrowIfNull is a project helper; presumably it throws when the receiver is null.
            config.ThrowIfNull("config");
            model.ThrowIfNull("model");

            var urlDetectorModel = config.UrlDetectorModel;
            urlDetectorModel.ThrowIfNull("config.UrlDetectorModel");

            // Copy the tunable settings from the configuration.
            Threshold              = config.Threshold;
            CyrillicLettersPercent = config.CyrillicLettersPercent;

            // Internal working state.
            _Model     = model;
            _Tokenizer = new mld_tokenizer(urlDetectorModel);
            _Sb        = new StringBuilder(50);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Creates a detector from the supplied configuration and model, copying all
        /// threshold settings from <paramref name="config"/> and allocating the
        /// per-language working buffers (sized by <c>LANGUAGES_COUNT</c>).
        /// </summary>
        /// <param name="config">
        /// Source of the threshold settings and of the <c>UrlDetectorModel</c> handed to
        /// the tokenizer. Checked with <c>ThrowIfNull</c>.
        /// </param>
        /// <param name="model">Model kept by the detector. Checked with <c>ThrowIfNull</c>.</param>
        public MDetector(MDetectorConfig config, IMModel model)
        {
            // Validation order is: config, its URL-detector model, then model.
            // NOTE(review): ThrowIfNull is a project helper; presumably it throws when the receiver is null.
            config.ThrowIfNull("config");
            var urlDetectorModel = config.UrlDetectorModel;
            urlDetectorModel.ThrowIfNull("config.UrlDetectorModel");
            model.ThrowIfNull("model");

            _Model = model;

            // Threshold settings, copied one-to-one from the configuration.
            ThresholdPercent                   = config.ThresholdPercent;
            ThresholdPercentBetween3Language   = config.ThresholdPercentBetween3Language;
            ThresholdDetectingWordCount        = config.ThresholdDetectingWordCount;
            ThresholdPercentDetectingWordCount = config.ThresholdPercentDetectingWordCount;
            ThresholdAbsoluteWeightLanguage    = config.ThresholdAbsoluteWeightLanguage;

            // Tokenizer plus working buffers, allocated once here.
            _Tokenizer           = new mld_tokenizer(urlDetectorModel);
            _Weights             = new float[LANGUAGES_COUNT];
            _TermCountByLanguage = new int[LANGUAGES_COUNT];
            _LanguageInfos       = new List<LanguageInfo>(LANGUAGES_COUNT);
            _NgramStringBuilder  = new StringBuilder(NGRAM_STRINGBUILDER_DEFAULT_LENGTH);

            // Delegate for ProcessTermCallback is created once and stored.
            _ProcessTermCallbackAction = new Action<string>(ProcessTermCallback);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Tokenizes every line of every file returned by
        /// <c>bp.EnumerateFilesFromInputFolder()</c>, feeds the accepted words into a
        /// <c>TFProcessor</c>, computes the ordered term probabilities and writes them
        /// as tab-separated <c>term</c>/<c>probability</c> lines to a single file in
        /// <c>bp.OutputDirectory</c>. Progress is reported on the console.
        /// </summary>
        /// <param name="bp">
        /// Build parameters: input/output directories, URL-detector model, word filters
        /// (<c>ClearCyrillicsChars</c>, <c>ClearDigitsChars</c>, <c>SingleWordMaxLength</c>)
        /// and the values used to compose the output file name.
        /// </param>
        /// <param name="tfProcessorDictionaryCapacity">Capacity passed to <c>TFProcessor.Create</c>.</param>
        /// <exception cref="InvalidDataException">No input file was enumerated.</exception>
        public static void Build(BuildParams_t bp, int tfProcessorDictionaryCapacity = TFPROCESSOR_DICTIONARY_CAPACITY)
        {
            #region [.-0-.]
            Console.WriteLine($"start process folder: '{bp.InputDirectory}'...");

            var tfProcessor = TFProcessor.Create(tfProcessorDictionaryCapacity);

#if DEBUG
            var skipWordCount = 0;
#endif
            // Select the word handler once, up front, so the per-word path does not
            // re-test the filter flags. All variants add the word to tfProcessor only
            // when it passes the active character filter and the length limit.
            // In DEBUG builds too-long words are logged; note that in the first two
            // variants this happens only for words that already passed the character
            // filter, while in the last two it happens for every too-long word.
            var processWordAction = default(Action<string>);
            if (bp.ClearCyrillicsChars)
            {
                if (bp.ClearDigitsChars)
                {
                    processWordAction = (word) =>
                    {
                        if (!word.HasCyrillicsOrDigitsChars())
                        {
                            if (word.Length <= bp.SingleWordMaxLength)
                            {
                                tfProcessor.AddTerm(word);
                            }
#if DEBUG
                            else
                            {
                                Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                            }
#endif
                        }
                    };
                }
                else
                {
                    processWordAction = (word) =>
                    {
                        if (!word.HasCyrillicsChars())
                        {
                            if (word.Length <= bp.SingleWordMaxLength)
                            {
                                tfProcessor.AddTerm(word);
                            }
#if DEBUG
                            else
                            {
                                Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                            }
#endif
                        }
                    };
                }
            }
            else
            {
                if (bp.ClearDigitsChars)
                {
                    processWordAction = (word) =>
                    {
                        if (word.Length <= bp.SingleWordMaxLength)
                        {
                            if (!word.HasDigitsChars())
                            {
                                tfProcessor.AddTerm(word);
                            }
                        }
#if DEBUG
                        else
                        {
                            Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                        }
#endif
                    };
                }
                else
                {
                    processWordAction = (word) =>
                    {
                        if (word.Length <= bp.SingleWordMaxLength)
                        {
                            tfProcessor.AddTerm(word);
                        }
#if DEBUG
                        else
                        {
                            Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                        }
#endif
                    };
                }
            }
            #endregion

            #region [.-1-.]
            var totalSentenceCount = 0;
            var first_fi           = default(FileInfo);
            var fis    = bp.EnumerateFilesFromInputFolder();
            var fi_num = 0;

            // The tokenizer is only needed while the input files are being read, so it is
            // scoped to this loop and released deterministically, even if reading fails.
            using (var tokenizer = new mld_tokenizer(bp.UrlDetectorModel))
            {
                foreach (var fi in fis)
                {
                    // The first file is remembered because its name is the base of the output file name.
                    if (first_fi == null)
                    {
                        first_fi = fi;
                    }

                    Console.WriteLine($"{(++fi_num)}). start process file: '{fi.Name}' [{fi.DisplaySize()}]...");

                    using (var sr = new StreamReader(fi.FullName, Config.Inst.INPUT_ENCODING))
                    {
                        for (var line = sr.ReadLine(); line != null; line = sr.ReadLine())
                        {
                            tokenizer.Run(line, processWordAction);

                            #region [.print-2-console.]
                            // A dot every 100 000 lines, a full status line every 1 000 000.
                            if ((++totalSentenceCount % 100_000) == 0)
                            {
                                Console.Write('.');
                                if ((totalSentenceCount % 1_000_000) == 0)
                                {
                                    Console.WriteLine($"sentence-count: {totalSentenceCount}, ngrams_1-count: {tfProcessor.DictionarySize}");
                                }
                            }
                            #endregion
                        }
                        #region [.print-2-console.]
                        Console.WriteLine($"total-sentence-count: {totalSentenceCount}");
                        #endregion
                    }
                    GC.Collect();
                    Console.WriteLine("end process file");
                }
            }

            if (first_fi == null)
            {
                throw (new InvalidDataException($"No .txt-files found by path: '{bp.InputDirectory}'"));
            }
            #endregion

            #region [.-2-.]
            Console.Write("start calc probability...");
            // NOTE(review): the cut uses Config.Inst.CUT_PERCENT while the output file name
            // below is built from bp.CutPercent - confirm that both always carry the same value.
            var probabilityResult = tfProcessor.CalcProbabilityOrdered(Config.Inst.CUT_PERCENT);
            // Drop the local reference to the processor before forcing a collection.
            tfProcessor = default(TFProcessor);
            GC.Collect();
            Console.WriteLine("end calc probability");
            #endregion

            #region [.-3-.]
            Console.Write("start write result...");
            // CreateDirectory does nothing when the directory already exists,
            // so a separate existence check is not needed.
            Directory.CreateDirectory(bp.OutputDirectory);

            // Probabilities are always written with '.' as the decimal separator.
            var nfi = new NumberFormatInfo()
            {
                NumberDecimalSeparator = "."
            };

            var outputFile = Path.Combine(bp.OutputDirectory, Path.GetFileNameWithoutExtension(first_fi.Name) + $"-({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%){first_fi.Extension}");

            using (var sw = new StreamWriter(outputFile, false, Config.Inst.OUTPUT_ENCODING))
            {
                // Header line, marked with a leading '#'.
                sw.WriteLine($"#\t'{first_fi.Name}' ({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%)");

                foreach (var tp in probabilityResult)
                {
                    // Terms with zero probability are not written.
                    if (tp.Probability != 0)
                    {
                        sw.Write(tp.Term);
                        sw.Write('\t');
                        sw.WriteLine(tp.Probability.ToString(nfi));
                    }
                }
            }

            Console.WriteLine($"end write result{Environment.NewLine}");
            #endregion
        }
Exemplo n.º 4
0
        /// <summary>
        /// Reads the file <c>_Fi</c> line by line, runs the tokenizer over each line with
        /// the word handler selected by the <c>_Bp</c> filter flags, then calls
        /// <c>ProcessLastAction()</c> and <c>ProcessNgrams()</c>. Progress is reported on
        /// the console.
        /// </summary>
        /// <exception cref="InvalidDataException">
        /// After reading, both <c>_OutputFilenames[NGramsEnum.ngram_1]</c> and
        /// <c>_DocumentNgrams_1</c> are empty.
        /// </exception>
        public void BuildTFMatrix_UsePortion()
        {
            using (var tokenizer = new mld_tokenizer(_Bp.UrlDetectorModel))
            {
                // Step 1: choose the word handler that matches the configured character filters.
                var skipCyrillics = _Bp.ClearCyrillicsChars;
                var skipDigits    = _Bp.ClearDigitsChars;

                Action<string> wordHandler;
                if (skipCyrillics && skipDigits)
                {
                    wordHandler = ProcessWordActionClearCyrillicsAndDigitsChars;
                }
                else if (skipCyrillics)
                {
                    wordHandler = ProcessWordActionClearCyrillicsChars;
                }
                else if (skipDigits)
                {
                    wordHandler = ProcessWordActionClearDigitsChars;
                }
                else
                {
                    wordHandler = ProcessWordAction;
                }

                var totalSentenceCount = 0;
                using (var reader = new StreamReader(_Fi.FullName, Config.Inst.INPUT_ENCODING))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        tokenizer.Run(line, wordHandler);

                        // Progress: a dot every 100 000 lines, a status line every 1 000 000.
                        if ((++totalSentenceCount % 100_000) != 0)
                        {
                            continue;
                        }

                        Console.Write('.');
                        if ((totalSentenceCount % 1_000_000) == 0)
                        {
                            Console.WriteLine($"sentence-count: {totalSentenceCount}, ngrams_1-count: {_DocumentNgrams_1.Count}");
                        }
                    }

                    Console.WriteLine($"total-sentence-count: {totalSentenceCount}");

                    ProcessLastAction();

                    // Nothing was produced for 1-grams and nothing is pending: treat the input as empty.
                    var noOutputFiles = _OutputFilenames[NGramsEnum.ngram_1].Count == 0;
                    if (noOutputFiles && (_DocumentNgrams_1.Count == 0))
                    {
                        throw (new InvalidDataException($"input text is-null-or-white-space, filename: '{_Fi.FullName}'"));
                    }
                }

                // Step 2: continue with the n-gram processing stage.
                ProcessNgrams();
            }
        }