Beispiel #1
0
        private float ProcessInternal(string text)
#endif
        {
            // Scores `text` against `_Model`: the number of model hits (single terms plus
            // adjacent term pairs joined by one space) divided by the number of adjacent pairs.
            // Returns NULL_WEIGHT when the cyrillic check fails, when the tokenizer yields
            // no terms, or when there is only one term (no pairs to divide by).
#if DEBUG
            var hasCyrillicLetters = rld_tokenizer.HasCyrillicLetters(text, _CyrillicLettersPercent, out cyrillicLettersPercent);
#else
            var hasCyrillicLetters = rld_tokenizer.HasCyrillicLetters(text, _CyrillicLettersPercent);
#endif
            if (!hasCyrillicLetters)
            {
                return NULL_WEIGHT;
            }

            var tokens     = _Tokenizer.Run(text);
            var tokenTotal = tokens.Count;
            if (tokenTotal == 0)
            {
                return NULL_WEIGHT;
            }

            // The first token can only contribute a single-term hit; it has no left neighbour.
            var previous  = tokens[0];
            var modelHits = _Model.Contains(previous) ? 1 : 0;
            var pairCount = 0;

            for (var index = 1; index < tokenTotal; index++)
            {
                var current = tokens[index];

                // single-term lookup
                if (_Model.Contains(current))
                {
                    modelHits++;
                }

                // two-term lookup: "<previous> <current>", built in the shared buffer
                var bigram = _Sb.Clear()
                                .Append(previous)
                                .Append(' ')
                                .Append(current)
                                .ToString();
                if (_Model.Contains(bigram))
                {
                    modelHits++;
                }

                previous = current;
                pairCount++;
            }

            if (pairCount == 0)
            {
                return NULL_WEIGHT;
            }

            return (float)modelHits / pairCount;
        }
Beispiel #2
0
        /// <summary>
        /// Reads every file returned by <c>bp.EnumerateFilesFromInputFolder()</c> line by line,
        /// tokenizes each line, accumulates the accepted words in a <c>TFProcessor</c>, and writes
        /// the resulting term / probability pairs (tab-separated, one per line) to a file in
        /// <c>bp.OutputDirectory</c>. The output file name is derived from the first input file.
        /// </summary>
        /// <param name="bp">Build parameters: input/output directories, word filters and naming values.</param>
        /// <param name="tfProcessorDictionaryCapacity">Capacity passed to <c>TFProcessor.Create</c>.</param>
        /// <exception cref="InvalidDataException">Thrown when the input enumeration yields no files.</exception>
        public static void Build(BuildParams_t bp, int tfProcessorDictionaryCapacity = TFPROCESSOR_DICTIONARY_CAPACITY)
        {
            #region [.-0-.]
            Console.WriteLine($"start process folder: '{bp.InputDirectory}'...");

            var tfProcessor = TFProcessor.Create(tfProcessorDictionaryCapacity);

#if DEBUG
            var skipWordCount = 0;
#endif
            // One specialised callback per filter combination, chosen once up front so the
            // per-word path does not re-test the flags.
            var processWordAction = default(Action <string>);
            if (bp.ClearCyrillicsChars)
            {
                if (bp.ClearDigitsChars)
                {
                    // reject words with cyrillic or digit chars, then apply the length limit
                    processWordAction = (word) =>
                    {
                        if (!word.HasCyrillicsOrDigitsChars())
                        {
                            if (word.Length <= bp.SingleWordMaxLength)
                            {
                                tfProcessor.AddTerm(word);
                            }
#if DEBUG
                            else
                            {
                                Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                            }
#endif
                        }
                    };
                }
                else
                {
                    // reject words with cyrillic chars, then apply the length limit
                    processWordAction = (word) =>
                    {
                        if (!word.HasCyrillicsChars())
                        {
                            if (word.Length <= bp.SingleWordMaxLength)
                            {
                                tfProcessor.AddTerm(word);
                            }
#if DEBUG
                            else
                            {
                                Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                            }
#endif
                        }
                    };
                }
            }
            else
            {
                if (bp.ClearDigitsChars)
                {
                    // apply the length limit, then reject words with digit chars
                    processWordAction = (word) =>
                    {
                        if (word.Length <= bp.SingleWordMaxLength)
                        {
                            if (!word.HasDigitsChars())
                            {
                                tfProcessor.AddTerm(word);
                            }
                        }
#if DEBUG
                        else
                        {
                            Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                        }
#endif
                    };
                }
                else
                {
                    // only the length limit applies
                    processWordAction = (word) =>
                    {
                        if (word.Length <= bp.SingleWordMaxLength)
                        {
                            tfProcessor.AddTerm(word);
                        }
#if DEBUG
                        else
                        {
                            Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                        }
#endif
                    };
                }
            }
            #endregion

            #region [.-1-.]
            var totalSentenceCount = 0;
            var first_fi           = default(FileInfo);
            var fis    = bp.EnumerateFilesFromInputFolder();
            var fi_num = 0;

            // The tokenizer is only needed while reading the input; the using block releases it
            // as soon as the last file is processed, including when reading fails with an exception.
            using (var tokenizer = new mld_tokenizer(bp.UrlDetectorModel))
            {
                foreach (var fi in fis)
                {
                    if (first_fi == null)
                    {
                        // remembered for naming the output file
                        first_fi = fi;
                    }

                    Console.WriteLine($"{(++fi_num)}). start process file: '{fi.Name}' [{fi.DisplaySize()}]...");

                    using (var sr = new StreamReader(fi.FullName, Config.Inst.INPUT_ENCODING))
                    {
                        for (var line = sr.ReadLine(); line != null; line = sr.ReadLine())
                        {
                            tokenizer.Run(line, processWordAction);

                            #region [.print-2-console.]
                            // progress: a dot every 100 000 lines, a summary every 1 000 000
                            if ((++totalSentenceCount % 100_000) == 0)
                            {
                                Console.Write('.');
                                if ((totalSentenceCount % 1_000_000) == 0)
                                {
                                    Console.WriteLine($"sentence-count: {totalSentenceCount}, ngrams_1-count: {tfProcessor.DictionarySize}");
                                }
                            }
                            #endregion
                        }
                        #region [.print-2-console.]
                        // the counter is cumulative across all files processed so far
                        Console.WriteLine($"total-sentence-count: {totalSentenceCount}");
                        #endregion
                    }
                    GC.Collect();
                    Console.WriteLine("end process file");
                }
            }

            if (first_fi == null)
            {
                throw (new InvalidDataException($"No .txt-files found by path: '{bp.InputDirectory}'"));
            }
            #endregion

            #region [.-2-.]
            Console.Write("start calc probability...");
            // NOTE(review): the cut is taken from Config.Inst.CUT_PERCENT while the output name below
            // uses bp.CutPercent — confirm both always carry the same value.
            var probabilityResult = tfProcessor.CalcProbabilityOrdered(Config.Inst.CUT_PERCENT);
            // drop the reference (also seen by the captured lambdas) before forcing a collection
            tfProcessor = default(TFProcessor);
            GC.Collect();
            Console.WriteLine("end calc probability");
            #endregion

            #region [.-3-.]
            Console.Write("start write result...");
            if (!Directory.Exists(bp.OutputDirectory))
            {
                Directory.CreateDirectory(bp.OutputDirectory);
            }

            // probabilities are always written with '.' as the decimal separator
            var nfi = new NumberFormatInfo()
            {
                NumberDecimalSeparator = "."
            };

            var outputFile = Path.Combine(bp.OutputDirectory, Path.GetFileNameWithoutExtension(first_fi.Name) + $"-({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%){first_fi.Extension}");

            using (var sw = new StreamWriter(outputFile, false, Config.Inst.OUTPUT_ENCODING))
            {
                // header line
                sw.WriteLine($"#\t'{first_fi.Name}' ({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%)");

                foreach (var tp in probabilityResult)
                {
                    // terms with zero probability are not written
                    if (tp.Probability != 0)
                    {
                        sw.Write(tp.Term);
                        sw.Write('\t');
                        sw.WriteLine(tp.Probability.ToString(nfi));
                    }
                }
            }

            Console.WriteLine($"end write result{Environment.NewLine}");
            #endregion
        }
Beispiel #3
0
        public void BuildTFMatrix_UsePortion()
        {
            // Streams the file `_Fi` line by line through the tokenizer using the word handler that
            // matches the configured character filters, finalizes via ProcessLastAction(), and then
            // runs ProcessNgrams(). Throws InvalidDataException when nothing was collected.
            using (var tokenizer = new mld_tokenizer(_Bp.UrlDetectorModel))
            {
                //-1- choose the per-word handler from the two filter flags
                var dropCyrillics = _Bp.ClearCyrillicsChars;
                var dropDigits    = _Bp.ClearDigitsChars;

                Action<string> wordHandler;
                if (dropCyrillics && dropDigits)
                {
                    wordHandler = ProcessWordActionClearCyrillicsAndDigitsChars;
                }
                else if (dropCyrillics)
                {
                    wordHandler = ProcessWordActionClearCyrillicsChars;
                }
                else if (dropDigits)
                {
                    wordHandler = ProcessWordActionClearDigitsChars;
                }
                else
                {
                    wordHandler = ProcessWordAction;
                }

                using (var reader = new StreamReader(_Fi.FullName, Config.Inst.INPUT_ENCODING))
                {
                    var sentenceTotal = 0;
                    string currentLine;
                    while ((currentLine = reader.ReadLine()) != null)
                    {
                        tokenizer.Run(currentLine, wordHandler);
                        sentenceTotal++;

                        // progress: a dot every 100 000 lines, a summary every 1 000 000
                        if ((sentenceTotal % 100_000) != 0)
                        {
                            continue;
                        }
                        Console.Write('.');
                        if ((sentenceTotal % 1_000_000) == 0)
                        {
                            Console.WriteLine($"sentence-count: {sentenceTotal}, ngrams_1-count: {_DocumentNgrams_1.Count}");
                        }
                    }
                    Console.WriteLine($"total-sentence-count: {sentenceTotal}");

                    ProcessLastAction();

                    // neither an output file for single-word n-grams nor any pending in-memory entry
                    var nothingCollected = (_OutputFilenames[NGramsEnum.ngram_1].Count == 0)
                                        && (_DocumentNgrams_1.Count == 0);
                    if (nothingCollected)
                    {
                        throw (new InvalidDataException($"input text is-null-or-white-space, filename: '{_Fi.FullName}'"));
                    }
                }

                //-2-
                ProcessNgrams();
            }
        }
Beispiel #4
0
        /// <summary>
        /// Tokenizes <paramref name="text"/>, accumulates per-language weights through
        /// <c>_ProcessTermCallbackAction</c>, and converts them into a sorted array of
        /// <c>LanguageInfo</c> entries.
        /// </summary>
        /// <param name="text">Text to analyse.</param>
        /// <returns>
        /// The languages that pass the absolute-weight and percent thresholds, sorted with
        /// <c>LanguageInfoComparer.Instance</c>; <c>LANGUAGEINFO_EMPTY</c> when no terms were
        /// counted, too few terms were detected, all weights are zero, no language passes the
        /// thresholds, or the top three languages are too close to each other.
        /// </returns>
        unsafe private LanguageInfo[] ProcessWithTermEnumerable(string text)
        {
            fixed(float *weightsPtrBase = _Weights)
            fixed(int *termCountByLanguagePtrBase = _TermCountByLanguage)
            {
                //-1-
                #region [.main phase.]
                // Publish the pinned buffers and reset the counters before tokenizing
                // (presumably consumed by _ProcessTermCallbackAction — not visible here).
                _WeightsPtrBase             = weightsPtrBase;
                _TermCountByLanguagePtrBase = termCountByLanguagePtrBase;
                _TermCount          = 0;
                _TermCountDetecting = 0;

                //zeroize
                for (var i = 0; i < LANGUAGES_COUNT; i++)
                {
                    weightsPtrBase[i]             = 0;
                    termCountByLanguagePtrBase[i] = 0;
                }

                _Tokenizer.Run(text, _ProcessTermCallbackAction);
                _TermPrevious = null;

                // No terms counted: the normalisation below would divide 0 by 0 (NaN) and the
                // later (int) cast of NaN is platform-dependent, so bail out explicitly.
                if (_TermCount == 0)
                {
                    return(LANGUAGEINFO_EMPTY);
                }

                // Scale each language weight by its share of the counted terms.
                for (var i = 0; i < LANGUAGES_COUNT; i++)
                {
                    var weightPtr = weightsPtrBase + i;
                    *   weightPtr = (*weightPtr * termCountByLanguagePtrBase[i]) / _TermCount;
                }
                #endregion

                //-2-
                #region [.form result.]
                // If the text has more than _ThresholdDetectingWordCount terms but fewer than
                // _ThresholdPercentDetectingWordCount percent of them were detected, treat it as
                // unknown (the original note cites "more than 9 words" and "10%" as the intent).
                if ((_ThresholdDetectingWordCount < _TermCount) &&
                    ((100 * _TermCountDetecting / _TermCount) < _ThresholdPercentDetectingWordCount)
                    )
                {
                    return(LANGUAGEINFO_EMPTY);
                }

                var weightsSum = default(float);
                for (var i = 0; i < LANGUAGES_COUNT; i++)
                {
                    weightsSum += weightsPtrBase[i];
                }

                if (weightsSum == NULL_WEIGHT)
                {
                    return(LANGUAGEINFO_EMPTY);
                }

                // _LanguageInfos is a reused buffer: start from a clean state so entries left
                // behind by an earlier call can never leak into this result.
                _LanguageInfos.Clear();

                var aspect = 100 / weightsSum;
                for (var i = 0; i < LANGUAGES_COUNT; i++)
                {
                    var weight = weightsPtrBase[i];
                    if (weight < _ThresholdAbsoluteWeightLanguage)
                    {
                        continue;
                    }
                    var percent = (int)(weight * aspect);                 // truncates; alternative was Convert.ToInt32( weight * aspect )
                    if (percent < _ThresholdPercent)
                    {
                        continue;
                    }

                    var language = ((Language)i);

                    _LanguageInfos.Add(new LanguageInfo(language, weight, percent));
                }

                _LanguageInfos.Sort(LanguageInfoComparer.Instance);

                // If three or more languages, starting from the first, differ from each other by no
                // more than the threshold (the original note cites 8%), the text is either in an
                // unknown language or heavily mixed — treat it as unknown.
                if (THREE_LANGUAGE <= _LanguageInfos.Count)
                {
                    var p1 = _LanguageInfos[0].Percent;
                    var p2 = _LanguageInfos[1].Percent;
                    if ((p1 - p2) <= _ThresholdPercentBetween3Language)
                    {
                        var p3 = _LanguageInfos[2].Percent;
                        if ((p2 - p3) <= _ThresholdPercentBetween3Language)
                        {
                            // Empty the shared buffer on this exit as well; previously the
                            // rejected entries stayed in the list and polluted the next call.
                            _LanguageInfos.Clear();
                            return(LANGUAGEINFO_EMPTY);
                        }
                    }
                }

                if (_LanguageInfos.Count == 0)
                {
                    return(LANGUAGEINFO_EMPTY);
                }

                var resultLanguageInfos = _LanguageInfos.ToArray();
                _LanguageInfos.Clear();
                return(resultLanguageInfos);

                #endregion
            }
        }