#if DEBUG
private float ProcessInternal(string text, out float cyrillicLettersPercent)
#else
private float ProcessInternal(string text)
#endif
{
#if DEBUG
    var hasCyrillicLetters = rld_tokenizer.HasCyrillicLetters(text, _CyrillicLettersPercent, out cyrillicLettersPercent);
#else
    var hasCyrillicLetters = rld_tokenizer.HasCyrillicLetters(text, _CyrillicLettersPercent);
#endif
    if (!hasCyrillicLetters)
    {
        return(NULL_WEIGHT);
    }

    var terms = _Tokenizer.Run(text);
    if (terms.Count == 0)
    {
        return(NULL_WEIGHT);
    }

    var termCount          = 0;
    var termCountInHashset = 0;
    var termPrevious       = terms[0];
    if (_Model.Contains(termPrevious))
    {
        termCountInHashset++;
    }
    for (int i = 1, len = terms.Count; i < len; i++)
    {
        var term = terms[i];
        if (_Model.Contains(term))
        {
            termCountInHashset++;
        }
        //check the adjacent-pair bigram "previous-term term" against the model as well
        var ngram_2 = _Sb.Clear().Append(termPrevious).Append(' ').Append(term).ToString();
        if (_Model.Contains(ngram_2))
        {
            termCountInHashset++;
        }
        termPrevious = term;
        termCount++;
    }
    if (termCount == 0)
    {
        return(NULL_WEIGHT);
    }

    //hit count (unigrams + bigrams) normalized by the number of bigram positions
    var totalWeight = (1.0f * termCountInHashset) / termCount;
    return(totalWeight);
}
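// ---------------------------------------------------------------------------------
// A minimal self-contained sketch (not part of the original source) of the weighting
// scheme ProcessInternal uses above: every unigram and every adjacent-pair bigram is
// looked up in the model set, and the hit count is divided by the number of bigram
// positions (terms.Count - 1). The toy 'model' in the example comment is hypothetical
// sample data, and 0f stands in for NULL_WEIGHT (assumed to be 0). Note the ratio can
// exceed 1.0, since up to 2*N - 1 hits are divided by N - 1.
internal static class HitRatioSketch
{
    internal static float Compute(System.Collections.Generic.IReadOnlyList<string> terms,
                                  System.Collections.Generic.HashSet<string> model)
    {
        if (terms.Count < 2) return (0f); //mirrors the NULL_WEIGHT early return

        var hits = model.Contains(terms[0]) ? 1 : 0;
        for (var i = 1; i < terms.Count; i++)
        {
            if (model.Contains(terms[i])) hits++;                       //unigram lookup
            if (model.Contains(terms[i - 1] + ' ' + terms[i])) hits++;  //bigram lookup
        }
        return ((1.0f * hits) / (terms.Count - 1));
    }
}
// e.g., terms = ["привет", "мир"], model = { "привет", "мир", "привет мир" } => 3 / 1 = 3.0f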
public static void Build(BuildParams_t bp, int tfProcessorDictionaryCapacity = TFPROCESSOR_DICTIONARY_CAPACITY)
{
    #region [.-0-.]
    Console.WriteLine($"start process folder: '{bp.InputDirectory}'...");

    var tokenizer   = new mld_tokenizer(bp.UrlDetectorModel);
    var tfProcessor = TFProcessor.Create(tfProcessorDictionaryCapacity);
#if DEBUG
    var skipWordCount = 0;
#endif
    //select the word-filtering callback once, up front, instead of branching per word
    var processWordAction = default(Action<string>);
    if (bp.ClearCyrillicsChars)
    {
        if (bp.ClearDigitsChars)
        {
            processWordAction = (word) =>
            {
                if (!word.HasCyrillicsOrDigitsChars())
                {
                    if (word.Length <= bp.SingleWordMaxLength)
                    {
                        tfProcessor.AddTerm(word);
                    }
#if DEBUG
                    else
                    {
                        Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                    }
#endif
                }
            };
        }
        else
        {
            processWordAction = (word) =>
            {
                if (!word.HasCyrillicsChars())
                {
                    if (word.Length <= bp.SingleWordMaxLength)
                    {
                        tfProcessor.AddTerm(word);
                    }
#if DEBUG
                    else
                    {
                        Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                    }
#endif
                }
            };
        }
    }
    else
    {
        if (bp.ClearDigitsChars)
        {
            processWordAction = (word) =>
            {
                if (word.Length <= bp.SingleWordMaxLength)
                {
                    if (!word.HasDigitsChars())
                    {
                        tfProcessor.AddTerm(word);
                    }
                }
#if DEBUG
                else
                {
                    Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                }
#endif
            };
        }
        else
        {
            processWordAction = (word) =>
            {
                if (word.Length <= bp.SingleWordMaxLength)
                {
                    tfProcessor.AddTerm(word);
                }
#if DEBUG
                else
                {
                    Console.Write($"\r\nskip-by-len #{(++skipWordCount)}: '{word}'");
                }
#endif
            };
        }
    }
    #endregion

    #region [.-1-.]
    var totalSentenceCount = 0;
    var first_fi = default(FileInfo);
    var fis      = bp.EnumerateFilesFromInputFolder();
    var fi_num   = 0;
    foreach (var fi in fis)
    {
        if (first_fi == null)
        {
            first_fi = fi;
        }

        Console.WriteLine($"{(++fi_num)}). start process file: '{fi.Name}' [{fi.DisplaySize()}]...");
        using (var sr = new StreamReader(fi.FullName, Config.Inst.INPUT_ENCODING))
        {
            for (var line = sr.ReadLine(); line != null; line = sr.ReadLine())
            {
                tokenizer.Run(line, processWordAction);

                #region [.print-2-console.]
                if ((++totalSentenceCount % 100_000) == 0)
                {
                    Console.Write('.');
                    if ((totalSentenceCount % 1_000_000) == 0)
                    {
                        Console.WriteLine($"sentence-count: {totalSentenceCount}, ngrams_1-count: {tfProcessor.DictionarySize}");
                    }
                }
                #endregion
            }

            #region [.print-2-console.]
            Console.WriteLine($"total-sentence-count: {totalSentenceCount}");
            #endregion
        }

        GC.Collect();
        Console.WriteLine("end process file");
    }

    if (first_fi == null)
    {
        throw (new InvalidDataException($"No .txt-files found by path: '{bp.InputDirectory}'"));
    }
    #endregion

    #region [.-2-.]
    Console.Write("start calc probability...");
    var probabilityResult = tfProcessor.CalcProbabilityOrdered(Config.Inst.CUT_PERCENT);
    tfProcessor = default(TFProcessor);
    GC.Collect();
    Console.WriteLine("end calc probability");
    #endregion

    #region [.-3-.]
    Console.Write("start write result...");
    if (!Directory.Exists(bp.OutputDirectory))
    {
        Directory.CreateDirectory(bp.OutputDirectory);
    }
    var nfi = new NumberFormatInfo() { NumberDecimalSeparator = "." };
    var outputFile = Path.Combine(bp.OutputDirectory,
        Path.GetFileNameWithoutExtension(first_fi.Name) + $"-({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%){first_fi.Extension}");
    using (var sw = new StreamWriter(outputFile, false, Config.Inst.OUTPUT_ENCODING))
    {
        sw.WriteLine($"#\t'{first_fi.Name}' ({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%)");
        foreach (var tp in probabilityResult)
        {
            if (tp.Probability != 0)
            {
                sw.Write(tp.Term);
                sw.Write('\t');
                sw.WriteLine(tp.Probability.ToString(nfi));
            }
        }
    }
    Console.WriteLine($"end write result{Environment.NewLine}");
    #endregion
}
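// ---------------------------------------------------------------------------------
// A hedged sketch (an assumption, not the library's actual TFProcessor internals) of
// the term-frequency -> probability step that CalcProbabilityOrdered is used for
// above: each term's count is divided by the total count, terms are ordered by
// descending probability, and a low-frequency tail of up to cutPercent of the total
// probability mass is dropped.
internal static class TfProbabilitySketch
{
    internal static System.Collections.Generic.List<(string Term, float Probability)> Calc(
        System.Collections.Generic.Dictionary<string, int> termFreq, float cutPercent)
    {
        long total = 0;
        foreach (var cnt in termFreq.Values) total += cnt;

        var ordered = new System.Collections.Generic.List<(string Term, float Probability)>(termFreq.Count);
        foreach (var p in termFreq) ordered.Add((p.Key, (float)p.Value / total));
        ordered.Sort((x, y) => y.Probability.CompareTo(x.Probability));

        //drop the low-probability tail while at most cutPercent of the mass is removed
        var cutMass = cutPercent / 100f;
        var removed = 0f;
        for (var i = ordered.Count - 1; 0 <= i; i--)
        {
            removed += ordered[i].Probability;
            if (cutMass < removed) break;
            ordered.RemoveAt(i);
        }
        return (ordered);
    }
}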
public void BuildTFMatrix_UsePortion()
{
    using (var tokenizer = new mld_tokenizer(_Bp.UrlDetectorModel))
    {
        //-1-
        var processWordAction = default(Action<string>);
        if (_Bp.ClearCyrillicsChars)
        {
            if (_Bp.ClearDigitsChars)
            {
                processWordAction = ProcessWordActionClearCyrillicsAndDigitsChars;
            }
            else
            {
                processWordAction = ProcessWordActionClearCyrillicsChars;
            }
        }
        else
        {
            if (_Bp.ClearDigitsChars)
            {
                processWordAction = ProcessWordActionClearDigitsChars;
            }
            else
            {
                processWordAction = ProcessWordAction;
            }
        }

        var totalSentenceCount = 0;
        using (var sr = new StreamReader(_Fi.FullName, Config.Inst.INPUT_ENCODING))
        {
            for (var line = sr.ReadLine(); line != null; line = sr.ReadLine())
            {
                tokenizer.Run(line, processWordAction);
                totalSentenceCount++;

                #region [.print sentence-count.]
                if ((totalSentenceCount % 100_000) == 0)
                {
                    Console.Write('.');
                    if ((totalSentenceCount % 1_000_000) == 0)
                    {
                        Console.WriteLine($"sentence-count: {totalSentenceCount}, ngrams_1-count: {_DocumentNgrams_1.Count}");
                    }
                }
                #endregion
            }

            #region [.print sentence-count.]
            Console.WriteLine($"total-sentence-count: {totalSentenceCount}");
            #endregion

            ProcessLastAction();

            if ((_OutputFilenames[NGramsEnum.ngram_1].Count == 0) && (_DocumentNgrams_1.Count == 0))
            {
                throw (new InvalidDataException($"input text is-null-or-white-space, filename: '{_Fi.FullName}'"));
            }
        }

        //-2-
        ProcessNgrams();
    }
}
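// ---------------------------------------------------------------------------------
// A standalone sketch (an assumption, not from the original source) of the 2x2
// callback selection used in both Build(...) and BuildTFMatrix_UsePortion() above;
// the four ProcessWordAction* members presumably mirror the inline lambdas in
// Build(...). The char tests below are simplified stand-ins for the library's
// HasCyrillicsChars / HasDigitsChars / HasCyrillicsOrDigitsChars extension methods.
internal static class WordFilterSketch
{
    private static bool HasCyrillics(string w)
    {
        foreach (var c in w) if (('\u0400' <= c) && (c <= '\u04FF')) return (true);
        return (false);
    }
    private static bool HasDigits(string w)
    {
        foreach (var c in w) if (char.IsDigit(c)) return (true);
        return (false);
    }

    internal static System.Action<string> Select(bool clearCyrillics, bool clearDigits,
                                                 int singleWordMaxLength, System.Action<string> addTerm)
    {
        if (clearCyrillics && clearDigits)
            return (w) => { if (!HasCyrillics(w) && !HasDigits(w) && (w.Length <= singleWordMaxLength)) addTerm(w); };
        if (clearCyrillics)
            return (w) => { if (!HasCyrillics(w) && (w.Length <= singleWordMaxLength)) addTerm(w); };
        if (clearDigits)
            return (w) => { if ((w.Length <= singleWordMaxLength) && !HasDigits(w)) addTerm(w); };
        return (w) => { if (w.Length <= singleWordMaxLength) addTerm(w); };
    }
}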
unsafe private LanguageInfo[] ProcessWithTermEnumerable(string text)
{
    fixed (float* weightsPtrBase = _Weights)
    fixed (int* termCountByLanguagePtrBase = _TermCountByLanguage)
    {
        //-1-
        #region [.main phase.]
        _WeightsPtrBase             = weightsPtrBase;
        _TermCountByLanguagePtrBase = termCountByLanguagePtrBase;
        _TermCount          = 0;
        _TermCountDetecting = 0;
        //zeroize
        for (var i = 0; i < LANGUAGES_COUNT; i++)
        {
            weightsPtrBase[i]             = 0;
            termCountByLanguagePtrBase[i] = 0;
        }

        _Tokenizer.Run(text, _ProcessTermCallbackAction);
        _TermPrevious = null;

        //scale each language weight by the fraction of terms seen for that language
        for (var i = 0; i < LANGUAGES_COUNT; i++)
        {
            var weightPtr = weightsPtrBase + i;
            *weightPtr = (*weightPtr * termCountByLanguagePtrBase[i]) / _TermCount;
        }
        #endregion

        //-2-
        #region [.form result.]
        //if a text of more than 9 words had no more than 10% of its words detected, discard it as unk.
        if ((_ThresholdDetectingWordCount < _TermCount) &&
            ((100 * _TermCountDetecting / _TermCount) < _ThresholdPercentDetectingWordCount)
           )
        {
            return(LANGUAGEINFO_EMPTY);
        }

        var weightsSum = default(float);
        for (var i = 0; i < LANGUAGES_COUNT; i++)
        {
            weightsSum += weightsPtrBase[i];
        }
        if (weightsSum == NULL_WEIGHT)
        {
            return(LANGUAGEINFO_EMPTY);
        }

        //normalize weights into percentages and keep only candidates above both thresholds
        var aspect = 100 / weightsSum;
        for (var i = 0; i < LANGUAGES_COUNT; i++)
        {
            var weight = weightsPtrBase[i];
            if (weight < _ThresholdAbsoluteWeightLanguage)
            {
                continue;
            }
            var percent = (int)(weight * aspect); //Convert.ToInt32( weight * aspect );
            if (percent < _ThresholdPercent)
            {
                continue;
            }
            var language = ((Language)i);
            _LanguageInfos.Add(new LanguageInfo(language, weight, percent));
        }
        _LanguageInfos.Sort(LanguageInfoComparer.Instance);

        //if there are 3 or more languages and, starting from the first, each differs from the next
        //by no more than 8%, the text is either an unknown language or heavily mixed: discard as unk.
        if (THREE_LANGUAGE <= _LanguageInfos.Count)
        {
            var p1 = _LanguageInfos[0].Percent;
            var p2 = _LanguageInfos[1].Percent;
            if ((p1 - p2) <= _ThresholdPercentBetween3Language)
            {
                var p3 = _LanguageInfos[2].Percent;
                if ((p2 - p3) <= _ThresholdPercentBetween3Language)
                {
                    return(LANGUAGEINFO_EMPTY);
                }
            }
        }

        if (0 < _LanguageInfos.Count)
        {
            var resultLanguageInfos = _LanguageInfos.ToArray();
            _LanguageInfos.Clear();
            return(resultLanguageInfos);
        }
        return(LANGUAGEINFO_EMPTY);
        #endregion
    }
}
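// ---------------------------------------------------------------------------------
// A self-contained sketch (not the original implementation) of the result-forming
// rules above: weights are scaled so they sum to 100 percent, and if the top three
// candidates each lie within 'thresholdBetween' percent of the next one, the result
// is considered ambiguous (unknown or heavily mixed language) and discarded.
internal static class AmbiguityRuleSketch
{
    internal static int[] ToPercents(float[] weights)
    {
        var sum = 0f;
        foreach (var w in weights) sum += w;

        var percents = new int[weights.Length];
        if (sum == 0f) return (percents);
        var aspect = 100 / sum; //same normalization factor as 'aspect' above
        for (var i = 0; i < weights.Length; i++) percents[i] = (int)(weights[i] * aspect);
        return (percents);
    }

    //'sortedPercents' must be ordered descending, as _LanguageInfos is after Sort(...)
    internal static bool IsAmbiguous(int[] sortedPercents, int thresholdBetween)
    {
        return ((3 <= sortedPercents.Length) &&
                ((sortedPercents[0] - sortedPercents[1]) <= thresholdBetween) &&
                ((sortedPercents[1] - sortedPercents[2]) <= thresholdBetween));
    }
}
// e.g., percents { 34, 33, 31 } with thresholdBetween = 8 => ambiguous, discard as unk.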