/// <summary>
/// Tokenizes every line of every file yielded by <c>bp.EnumerateFilesFromInputFolder()</c>,
/// counts the accepted words in a <c>TFProcessor</c>, and writes the ordered term
/// probabilities as tab-separated lines into <c>bp.OutputDirectory</c>.
/// </summary>
/// <remarks>
/// A word is counted only when its length does not exceed <c>bp.SingleWordMaxLength</c> and it
/// passes the character filters switched on by <c>bp.ClearCyrillicsChars</c> and
/// <c>bp.ClearDigitsChars</c>. The output file name is derived from the first processed input file.
/// Terms whose probability equals zero are not written.
/// </remarks>
/// <param name="bp">Build settings (input/output directories, filters, naming parts).</param>
/// <param name="tfProcessorDictionaryCapacity">Value handed to <c>TFProcessor.Create</c>.</param>
/// <exception cref="InvalidDataException">The input enumeration yielded no files.</exception>
public static void Build(BuildParams_t bp, int tfProcessorDictionaryCapacity = TFPROCESSOR_DICTIONARY_CAPACITY)
{
    // ---- step 0: tokenizer, term counter and the per-word filter ----
    Console.WriteLine($"start process folder: '{bp.InputDirectory}'...");

    var wordTokenizer = new mld_tokenizer(bp.UrlDetectorModel);
    var termCounter = TFProcessor.Create(tfProcessorDictionaryCapacity);

#if DEBUG
    var skippedByLength = 0;

    // Debug builds print every word that was rejected because of its length.
    void ReportSkippedByLength(string word)
    {
        Console.Write($"\r\nskip-by-len #{(++skippedByLength)}: '{word}'");
    }
#endif

    // The filter is picked once, up front, so the per-word callback does not
    // re-inspect the two flags for every token.
    var dropCyrillics = bp.ClearCyrillicsChars;
    var dropDigits = bp.ClearDigitsChars;

    Action<string> onWord;
    if (dropCyrillics && dropDigits)
    {
        onWord = word =>
        {
            if (word.HasCyrillicsOrDigitsChars())
            {
                return;
            }
            if (word.Length <= bp.SingleWordMaxLength)
            {
                termCounter.AddTerm(word);
                return;
            }
#if DEBUG
            ReportSkippedByLength(word);
#endif
        };
    }
    else if (dropCyrillics)
    {
        onWord = word =>
        {
            if (word.HasCyrillicsChars())
            {
                return;
            }
            if (word.Length <= bp.SingleWordMaxLength)
            {
                termCounter.AddTerm(word);
                return;
            }
#if DEBUG
            ReportSkippedByLength(word);
#endif
        };
    }
    else if (dropDigits)
    {
        // Here the length is tested first; the digit test runs only for words that fit.
        onWord = word =>
        {
            if (word.Length <= bp.SingleWordMaxLength)
            {
                if (!word.HasDigitsChars())
                {
                    termCounter.AddTerm(word);
                }
                return;
            }
#if DEBUG
            ReportSkippedByLength(word);
#endif
        };
    }
    else
    {
        onWord = word =>
        {
            if (word.Length <= bp.SingleWordMaxLength)
            {
                termCounter.AddTerm(word);
                return;
            }
#if DEBUG
            ReportSkippedByLength(word);
#endif
        };
    }

    // ---- step 1: feed every line of every input file through the tokenizer ----
    var sentenceTotal = 0;
    var fileOrdinal = 0;
    var firstFile = default(FileInfo);

    foreach (var inputFile in bp.EnumerateFilesFromInputFolder())
    {
        if (firstFile is null)
        {
            // Remembered because the output file name is built from it later.
            firstFile = inputFile;
        }

        Console.WriteLine($"{(++fileOrdinal)}). start process file: '{inputFile.Name}' [{inputFile.DisplaySize()}]...");

        using (var reader = new StreamReader(inputFile.FullName, Config.Inst.INPUT_ENCODING))
        {
            var line = reader.ReadLine();
            while (line != null)
            {
                wordTokenizer.Run(line, onWord);

                // Progress: a dot every 100 000 lines, a summary every 1 000 000 lines.
                sentenceTotal++;
                if (sentenceTotal % 100_000 == 0)
                {
                    Console.Write('.');
                    if (sentenceTotal % 1_000_000 == 0)
                    {
                        Console.WriteLine($"sentence-count: {sentenceTotal}, ngrams_1-count: {termCounter.DictionarySize}");
                    }
                }

                line = reader.ReadLine();
            }

            // The counter is cumulative across files, not per file.
            Console.WriteLine($"total-sentence-count: {sentenceTotal}");
        }

        GC.Collect();
        Console.WriteLine("end process file");
    }

    if (firstFile is null)
    {
        throw new InvalidDataException($"No .txt-files found by path: '{bp.InputDirectory}'");
    }

    // ---- step 2: turn the counts into ordered probabilities ----
    Console.Write("start calc probability...");
    var orderedProbabilities = termCounter.CalcProbabilityOrdered(Config.Inst.CUT_PERCENT);
    // Drop the reference (also seen by the callback closure) before forcing a collection.
    termCounter = default(TFProcessor);
    GC.Collect();
    Console.WriteLine("end calc probability");

    // ---- step 3: write "term<TAB>probability" lines ----
    Console.Write("start write result...");
    if (!Directory.Exists(bp.OutputDirectory))
    {
        Directory.CreateDirectory(bp.OutputDirectory);
    }

    // Probabilities are always written with '.' as the decimal separator.
    var dotDecimal = new NumberFormatInfo { NumberDecimalSeparator = "." };
    var outputName = Path.GetFileNameWithoutExtension(firstFile.Name) + $"-({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%){firstFile.Extension}";
    var outputPath = Path.Combine(bp.OutputDirectory, outputName);

    using (var writer = new StreamWriter(outputPath, false, Config.Inst.OUTPUT_ENCODING))
    {
        // Header line describing the source file and the build settings.
        writer.WriteLine($"#\t'{firstFile.Name}' ({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%)");

        foreach (var entry in orderedProbabilities)
        {
            if (entry.Probability == 0)
            {
                continue;
            }

            writer.Write(entry.Term);
            writer.Write('\t');
            writer.WriteLine(entry.Probability.ToString(dotDecimal));
        }
    }

    Console.WriteLine($"end write result{Environment.NewLine}");
}
/// <summary>
/// Processes every file yielded by <c>bp.EnumerateFilesFromInputFolder()</c> through
/// <c>BuildTFMatrix_UsePortion</c>, accumulating into one shared <c>TFProcessor</c>, then writes
/// the ordered term probabilities as tab-separated lines into <c>bp.OutputDirectory</c>.
/// </summary>
/// <remarks>
/// The output file name is derived from the first processed input file. Terms whose probability
/// equals zero are not written. After writing, the <c>temp</c> sub-folder of
/// <c>bp.InputDirectory</c> is removed when it exists and contains no files at any depth.
/// </remarks>
/// <param name="bp">Build settings (input/output directories, naming parts).</param>
/// <param name="tfProcessorDictionaryCapacity">Value handed to <c>TFProcessor.Create</c>.</param>
/// <exception cref="InvalidDataException">The input enumeration yielded no files.</exception>
public static void Build_UsePortion(BuildParams_t bp, int tfProcessorDictionaryCapacity = TFPROCESSOR_DICTIONARY_CAPACITY)
{
    #region [.-0-.]
    Console.WriteLine($"start process folder: '{bp.InputDirectory}'...");

    var tfProcessor = TFProcessor.Create(tfProcessorDictionaryCapacity);
    #endregion

    #region [.-1-.]
    var first_fi = default(FileInfo);
    var fi_num = 0;
    foreach (var fi in bp.EnumerateFilesFromInputFolder())
    {
        if (first_fi == null)
        {
            // Remembered because the output file name is built from it later.
            first_fi = fi;
        }

        Console.WriteLine($"{(++fi_num)}). start process file: '{fi.Name}' [{fi.DisplaySize()}]...");

        BuildTFMatrix_UsePortion(bp, fi, tfProcessor);

        Console.WriteLine($"end process file{Environment.NewLine}");
    }

    if (first_fi == null)
    {
        throw new InvalidDataException($"No .txt-files found by path: '{bp.InputDirectory}'");
    }
    #endregion

    #region [.-2-.]
    Console.Write("start calc probability...");
    var probabilityResult = tfProcessor.CalcProbabilityOrdered(Config.Inst.CUT_PERCENT);
    // Drop the reference before forcing a collection.
    tfProcessor = default(TFProcessor);
    GC.Collect();
    Console.WriteLine("end calc probability");
    #endregion

    #region [.-3-.]
    Console.Write("start write result...");
    if (!Directory.Exists(bp.OutputDirectory))
    {
        Directory.CreateDirectory(bp.OutputDirectory);
    }

    // Probabilities are always written with '.' as the decimal separator.
    var nfi = new NumberFormatInfo() { NumberDecimalSeparator = "." };

    var outputFile = Path.Combine(bp.OutputDirectory, Path.GetFileNameWithoutExtension(first_fi.Name) + $"-({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%){first_fi.Extension}");
    using (var sw = new StreamWriter(outputFile, false, Config.Inst.OUTPUT_ENCODING))
    {
        // Header line describing the source file and the build settings.
        sw.WriteLine($"#\t'{first_fi.Name}' ({bp.Ngrams}-cut_{bp.CutPercent.GetValueOrDefault()}%)");

        foreach (var tp in probabilityResult)
        {
            if (tp.Probability != 0)
            {
                sw.Write(tp.Term);
                sw.Write('\t');
                sw.WriteLine(tp.Probability.ToString(nfi));
            }
        }
    }

    // NOTE(review): "temp" is presumably the scratch folder used by BuildTFMatrix_UsePortion - confirm.
    // The delete is recursive, so the emptiness check must cover the whole tree:
    // checking only the top level would let files in sub-folders be destroyed silently.
    var tempOutputFolder = Path.Combine(bp.InputDirectory, "temp");
    if (Directory.Exists(tempOutputFolder) &&
        !Directory.EnumerateFiles(tempOutputFolder, "*", SearchOption.AllDirectories).Any())
    {
        Directory.Delete(tempOutputFolder, true);
    }

    Console.WriteLine($"end write result{Environment.NewLine}");
    #endregion
}